#Libraries
#The libraries necessary for the Project
library(e1071)
library(textstem)
library(udpipe)
library(sentimentr)
library(ggraph)
library(wordcloud)
library(wordcloud2)
library(tidygraph)
library(viridisLite)
library(visNetwork)
library(tidyverse)
library(textdata)
library(tidytext)
library(vosonSML)
library(igraph)
library(SentimentAnalysis)
library(SnowballC)
library(dplyr)
library(tm)
library(stringr)
library(ggplot2)
library(ggpmisc)
library(psych)
#Getting the Data ##Youtube API
# Read the YouTube API key from a local text file (kept out of the script
# so the key is not committed with the code).
file_path <- "API_Key.txt"
# Collect the comments of one video through the YouTube API via vosonSML.
API <- read.table(file_path, header = FALSE, sep = "\t")
youtubeAuth <- Authenticate("youtube", apiKey = API[1, 1])
videoID <- "https://www.youtube.com/watch?v=VGWGcESPltM"
# maxComments caps the download; verbose/writeToFile keep the call quiet.
youtubeData <- youtubeAuth |>
  Collect(videoID, maxComments = 111000, verbose = FALSE, writeToFile = FALSE)
rm(youtubeAuth)
rm(videoID)
##Saving the Original Youtube Data
# Persist the raw collection so the API does not have to be queried again.
write.csv(youtubeData, "YoutubeData.csv", row.names = TRUE)
##Reading the Original Youtube Data
# Reload from disk; the saved row names come back as an extra "X" column,
# which is dropped further down.
youtubeData <- read.csv("YoutubeData.csv")
##Taking Care of Variable Types
# Coerce the two count columns to numeric and normalise the comment text to
# UTF-8 so later string processing behaves consistently.
youtubeData$ReplyCount <- as.numeric(youtubeData$ReplyCount)
youtubeData$Comment <- enc2utf8(youtubeData$Comment)
youtubeData$LikeCount <- as.numeric(youtubeData$LikeCount)
##Removing Useless columns
# Metadata columns that add nothing to the analysis ("X" is the row-name
# column created by write.csv/read.csv above).
columns_to_remove <- c("X", "AuthorProfileImageUrl", "AuthorChannelUrl",
                       "AuthorChannelID", "UpdatedAt", "VideoID")
# Keep every column that is not on the removal list.
kept_columns <- setdiff(names(youtubeData), columns_to_remove)
youtubeData <- youtubeData[, kept_columns]
rm(kept_columns)
colnames(youtubeData)
## [1] "Comment" "AuthorDisplayName" "ReplyCount"
## [4] "LikeCount" "PublishedAt" "CommentID"
## [7] "ParentID"
rm(columns_to_remove)
#Descriptive Analysis ##Reply Count Distribution
# Histogram of how many replies each comment received.
ggplot(youtubeData, aes(x = ReplyCount)) +
  geom_histogram(binwidth = 4, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of ReplyCount",
       x = "ReplyCount",
       y = "Frequency")
# Same histogram with a log-scaled y axis: most comments get zero replies,
# so the log scale makes the long tail visible.
ggplot(youtubeData, aes(x = ReplyCount)) +
  geom_histogram(binwidth = 3, fill = "skyblue", color = "black", alpha = 0.7) +
  scale_y_log10() +
  labs(title = "Distribution of ReplyCount (log scale)",
       x = "ReplyCount",
       y = "Frequency (log scale)") +
  theme(axis.title = element_text(size = 14))
#In this code, the scale_y_log10() function is added to the ggplot object, which transforms the y-axis to a logarithmic scale. This can help in better visualizing the distribution when there are a large number of values close to zero.
##LikeCount Distribution
# Histogram of comment likes. Likes span several orders of magnitude, so the
# x axis is log-scaled (scale_x_log10). The labels previously claimed the
# y axis was log-scaled, which did not match the code — fixed here.
ggplot(youtubeData, aes(x = LikeCount)) +
  geom_histogram(binwidth = .1, fill = "skyblue", color = "black", alpha = 0.7) +
  scale_x_log10() +
  labs(title = "Distribution of Like Count (log scale)",
       x = "LikeCount (log scale)",
       y = "Frequency") +
  theme(axis.title = element_text(size = 14))
##Numerical Variables Summary
# Print a five-number summary (plus mean) for each numeric column of interest.
columns_of_interest <- c("ReplyCount", "LikeCount")
for (column in columns_of_interest) {
  cat("Summary for column:", column, "\n")
  print(summary(youtubeData[[column]]))
  cat("\n")
}
## Summary for column: ReplyCount
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2573 0.0000 485.0000
##
## Summary for column: LikeCount
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 11.23 1.00 117913.00
rm(columns_of_interest)
rm(column)
##Authors Frequency
# Count how many comments each author posted and keep the 20 most active.
top_authors <- table(youtubeData$AuthorDisplayName) %>%
  sort(decreasing = TRUE) %>%
  head(20)
# Build a tidy data frame with a plain numeric count. (Passing the table
# object straight into data.frame() created awkward "Frequency.Var1" /
# "Frequency.Freq" columns, duplicating the author names.)
df <- data.frame(AuthorDisplayName = names(top_authors),
                 Frequency = as.numeric(top_authors))
rm(top_authors)
# Bar plot of the most frequent commenters, ordered by count.
ggplot(df, aes(x = reorder(AuthorDisplayName, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Frequency of Author Display Names",
       x = "Author Display Name",
       y = "Frequency")
rm(df)
##Relationship Between Likes and Replies
# Hex-binned density of likes vs. replies with a quadratic fit; the fitted
# equation and adjusted R^2 are annotated on the plot.
ggplot(youtubeData, aes(x = LikeCount, y = ReplyCount)) +
  geom_hex(bins = 20, color = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, formula = y ~ poly(x, 2)) +
  # after_stat() replaces the deprecated ..var.. notation (ggplot2 >= 3.4).
  stat_poly_eq(aes(label = paste(after_stat(eq.label),
                                 after_stat(adj.rr.label), sep = "~~~~")),
               formula = y ~ poly(x, 2), parse = TRUE) +
  labs(title = "Density plot of Reply Counts and Comment Likes") +
  theme(axis.title = element_text(size = 14))
#Sentiment Analysis ##Cleaning the Text ###Remove NAs
# Drop any rows whose comment text is missing.
youtubeData <- filter(youtubeData, !is.na(Comment))
###Lower Case
# Normalise everything to lower case before tokenising.
youtubeData$Comment <- tolower(youtubeData$Comment)
###Word Frequency Before Further Cleaning
# Tokenise the raw comments to inspect the most frequent words before any
# stop-word removal.
words <- tibble(line = seq_len(nrow(youtubeData)),
                text = youtubeData$Comment) %>%
  unnest_tokens(word, text)
counts <- count(words, word, sort = TRUE)
# Keep the 20 most frequent words.
top20 <- slice_head(counts, n = 20)
# Bar chart of the top words, ordered by frequency.
ggplot(top20, aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(y = NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
rm(counts)
rm(top20)
###Word Cloud Before Further Cleaning
# The same frequencies as a word cloud (top 100 words) for easier reading.
words %>%
  count(word, sort = TRUE) %>%
  head(100) %>%
  wordcloud2(size = .8, shape = 'triangle-forward',
             color = c("steelblue", "firebrick", "darkorchid"),
             backgroundColor = "white")
rm(words)
###Stop words
# Remove the standard English stop words.
youtubeData$Comment <- removeWords(youtubeData$Comment, stopwords("en"))
###Custom Stop words
# Contraction fragments and stray punctuation left behind by the cleaning;
# unique() guards against the duplicated entries ("don", "u") the original
# list carried.
custom_stopwords <- unique(c("s", "t", "m", "re", "don", "doesn", "ve",
                             "didn", "u", "isn", "p", "{", "}", "\\", "'"))
youtubeData$Comment <- removeWords(youtubeData$Comment, custom_stopwords)
rm(custom_stopwords)
###Lemmatization
# lemmatize_strings() lemmatizes every word inside each comment string.
# The original call used lemmatize_words(), which expects a vector of single
# tokens and leaves multi-word strings effectively untouched.
youtubeData$Comment <- lemmatize_strings(youtubeData$Comment)
###Tokenization
# One row per (comment line, token) pair for the sentiment analysis below.
SentimentData <- tibble(line = seq_len(nrow(youtubeData)),
                        text = youtubeData$Comment)
SentimentData_tokens <- unnest_tokens(SentimentData, word, text)
SentimentData_tokens %>% count(word, sort = TRUE)
## # A tibble: 46,931 × 2
## word n
## <chr> <int>
## 1 piers 50099
## 2 tate 34063
## 3 andrew 26253
## 4 like 15979
## 5 just 14200
## 6 man 12470
## 7 interview 11987
## 8 people 11147
## 9 morgan 11019
## 10 can 10373
## # ℹ 46,921 more rows
###Word Frequency After Further Cleaning
# Recount the tokens now that stop words are removed and words lemmatised.
counts <- count(SentimentData_tokens, word, sort = TRUE)
# Keep the 20 most frequent cleaned words.
top20 <- slice_head(counts, n = 20)
print(top20)
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 piers 50099
## 2 tate 34063
## 3 andrew 26253
## 4 like 15979
## 5 just 14200
## 6 man 12470
## 7 interview 11987
## 8 people 11147
## 9 morgan 11019
## 10 can 10373
## 11 women 8758
## 12 say 7471
## 13 authority 7252
## 14 think 6926
## 15 get 6901
## 16 men 6805
## 17 said 6733
## 18 one 6688
## 19 let 6277
## 20 woman 6017
# Bar chart of the cleaned top-20 words.
ggplot(top20, aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(y = NULL) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
rm(top20)
rm(counts)
###Word Cloud After Further Cleaning
# Word cloud of the 100 most frequent cleaned tokens.
SentimentData_tokens %>%
  count(word, sort = TRUE) %>%
  head(100) %>%
  wordcloud2(size = 1, shape = 'triangle-forward',
             color = c("steelblue", "firebrick", "darkorchid"),
             backgroundColor = "white")
rm(SentimentData)
rm(SentimentData_tokens)
##Words Graph ###Creating the Words Graph Dataset
#Here the goal is to classify the comments that talk about Andrew Tate, and the ones that talk about Piers Morgan.
# Vectorised replacement for the original row-by-row loop: one regex per
# person, evaluated over the whole Comment vector at once. "pierce" is a
# common misspelling of Piers; "top" and " g " catch Tate's "Top G"
# nickname and subsume the redundant original patterns "tate." and "topg".
piers_pattern <- "piers|pierce|morgan"
tate_pattern <- "andrew|tate|top| g "
mentions_piers <- grepl(piers_pattern, youtubeData$Comment, ignore.case = TRUE)
mentions_tate <- grepl(tate_pattern, youtubeData$Comment, ignore.case = TRUE)
# Comments mentioning both people are ambiguous and get NA (dropped below);
# the rest are labelled with the single person they mention.
Result <- data.frame(
  Name = ifelse(mentions_piers & mentions_tate, NA,
         ifelse(mentions_piers, "Piers Morgan",
         ifelse(mentions_tate, "Andrew Tate", NA))),
  Attribute = youtubeData$Comment,
  stringsAsFactors = FALSE
)
Result <- Result[complete.cases(Result$Name), ]
rm(piers_pattern, tate_pattern, mentions_piers, mentions_tate)
###Cleaning the Words Graph Dataset
# Collapse runs of whitespace into single spaces and strip leading spaces.
Result$Attribute <- gsub("\\s+", " ", Result$Attribute)
Result$Attribute <- gsub("^\\s+", "", Result$Attribute)
# Explode each comment into one row per word, trim whitespace, and drop
# empty tokens and the names of the two people themselves.
unwanted_values <- c("morgan", "tate", "top", "g", "piers", "andrew",
                     "topg", "pierce")
result_data <- Result %>%
  separate_rows(Attribute, sep = " ") %>%
  mutate(Attribute = str_trim(Attribute)) %>%
  na.omit() %>%
  filter(Attribute != "") %>%
  filter(!Attribute %in% unwanted_values)
rm(unwanted_values)
rm(Result)
#To get the frequency of the words and to which name they are attached
# count() replaces the group_by/summarise pair, and slice_max() replaces the
# superseded top_n(); ties are kept, matching the old behaviour (top 100
# words per person, with ties).
result_data_final <- result_data %>%
  count(Name, Attribute, name = "Frequency") %>%
  group_by(Name) %>%
  slice_max(Frequency, n = 100) %>%
  ungroup()
rm(result_data)
#Remove Useless words
# Filler words to drop from the word graph. The original filter used
# grepl(paste(words, collapse = "|")), which matches SUBSTRINGS — e.g. "go"
# also removed "good" and "us" removed "because". An exact match against
# the word list is what the list intends.
stop_words <- c(
  "just", "can", "people", "just", "don", "say", "think", "said", "can",
  "get", "let", "will", "one", "even", "way", "make", "things", "know",
  "says", "doesn", "saying", "let", "one", "don", "trying", "get", "time",
  "want", "now", "see", "trying", "say", "someone", "also", "go", "every",
  "many", "got", "look", "thing", "even", "actaully", "got", "ve",
  "someone", "make", "said", "still", "show", "every", "something",
  "will", "didn", "lot", "way", "u", "going", "everything", "back",
  "much", "word", "always", "give", "actually", "saying", "see",
  "made", "keep", "whole", "video", "now", "isn", "go", "years",
  "u", "us", "ever", "words", "dont", "really", "watch", "person", "point",
  "pierce", "tates", "pierse", "pierce.", "andrews", "tate.", "don"
)
unique_stop_words <- unique(stop_words)
# Keep only rows whose word is not itself a stop word (the list is all
# lower case, so tolower() gives the case-insensitive comparison).
result_data_filtered <- result_data_final %>%
  filter(!tolower(Attribute) %in% unique_stop_words)
rm(result_data_final)
rm(unique_stop_words)
rm(stop_words)
#Final Data for the word graph
# Sum frequencies per (Name, word) pair and drop single-character words.
result_df <- aggregate(Frequency ~ Name + Attribute,
                       data = result_data_filtered, sum)
result_df <- filter(result_df, nchar(Attribute) > 1)
rm(result_data_filtered)
###Plot of Words Graph
# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame(). First column pair = edges (Name -> word), remaining
# columns (Frequency) become edge attributes.
list_network <- graph_from_data_frame(result_df, directed = FALSE)
plot(list_network, vertex.size = 10, vertex.label.cex = .8)
rm(result_df)
# Highlight very frequent word-person edges in red.
edge_colors <- ifelse(E(list_network)$Frequency > 1000, "red", "gray")
# The two person nodes in blue, every word node in green.
node_colors <- ifelse(V(list_network)$name %in% c("Andrew Tate", "Piers Morgan"),
                      "blue", "green")
# Plotting the network with customized colors
plot(
  list_network,
  vertex.size = 12,
  vertex.label.cex = 0.8,
  edge.color = edge_colors,
  vertex.color = node_colors
)
rm(list_network)
rm(node_colors)
rm(edge_colors)
##Creating Sentiment Analysis Data
# sentimentr scores each sentence; average the scores per comment
# (element_id) and total the word counts.
sentences <- get_sentences(youtubeData$Comment)
sentiment_scores <- sentiment(sentences)
result <- sentiment_scores %>%
  group_by(element_id) %>%
  summarize(sentiment = mean(sentiment), word_count = sum(word_count))
rm(sentences)
rm(sentiment_scores)
# Attach the per-comment scores back onto the main data frame.
youtubeData <- cbind(youtubeData, result)
rm(result)
# Drop the bookkeeping columns produced by sentimentr.
columns_to_remove <- c("element_id", "sentence_id")
youtubeData <- youtubeData[, !(colnames(youtubeData) %in% columns_to_remove)]
rm(columns_to_remove)
###Descriptive Analysis of Sentiment Scores
# Sentiment histogram (as a density) with a fitted normal curve overlaid.
# after_stat(density) replaces the deprecated ..density.. notation and
# linewidth replaces the deprecated size aesthetic for lines (ggplot2 >= 3.4).
ggplot(youtubeData, aes(x = sentiment)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.05,
                 fill = "skyblue", color = "black", alpha = 0.7) +
  stat_function(fun = dnorm,
                args = list(mean = mean(youtubeData$sentiment),
                            sd = sd(youtubeData$sentiment)),
                color = "red", linewidth = 1) +
  labs(title = "Sentiment Histogram",
       x = "Sentiment",
       y = "Density") +
  theme_minimal() +
  theme(axis.title = element_text(size = 14)) +
  xlim(c(-1, 1))
#Skewness
# Slightly positive: the distribution leans a little towards positive scores.
skewness_value <- skewness(youtubeData$sentiment)
print(skewness_value)
## [1] 0.05120366
rm(skewness_value)
# Share of strictly positive sentiment scores among all non-zero scores.
positive_values <- sum(youtubeData$sentiment > 0)
negative_values <- sum(youtubeData$sentiment < 0)
# Calculate the percentage
percentage_positive <- 100 * positive_values / (positive_values + negative_values)
cat("Percentage of positive values over negative values:", percentage_positive, "%\n")
## Percentage of positive values over negative values: 51.65105 %
rm(positive_values)
rm(negative_values)
rm(percentage_positive)
#Scatterplot of Sentiment by Date
# NOTE(review): PublishedAt comes straight from read.csv() and is presumably
# still a character column at this point, so ggplot would treat the x axis
# as discrete and the lm smooth may not render as intended — consider
# parsing it to a date-time before plotting. TODO confirm the column class.
ggplot(youtubeData, aes(x = PublishedAt, y = sentiment, color = sentiment)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(title = "Scatterplot of Sentiment by Date") +
scale_color_gradient(low = "red", high = "green")
#Plot by Likes
# Hex-binned density of sentiment against likes with a quadratic trend line.
ggplot(youtubeData, aes(x = LikeCount, y = sentiment)) +
  geom_hex(bins = 20, color = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, formula = y ~ poly(x, 2)) +
  labs(title = "Density plot of Sentiment in Youtube Video by Comment Likes")
#Correlation between likes and sentiment
# Spearman rank correlation, since both variables are heavily skewed.
correlation_value <- cor(youtubeData$sentiment, youtubeData$LikeCount,
                         method = "spearman")
cat("Correlation between LikeCount and the sentiment score:", correlation_value, "\n")
## Correlation between LikeCount and the sentiment score: 0.07446133
rm(correlation_value)
#Plot by Reply Count
# Hex-binned density of sentiment against replies with a quadratic trend line.
ggplot(youtubeData, aes(x = ReplyCount, y = sentiment)) +
  geom_hex(bins = 20, color = "white") +
  scale_fill_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, formula = y ~ poly(x, 2)) +
  labs(title = "Density plot of Sentiment in Youtube Video by Comment Replies")
#Correlation between sentiment and replies
correlation_value <- cor(youtubeData$sentiment, youtubeData$ReplyCount,
                         method = "spearman")
# Fixed: the printed message previously said "LikeCount", but this
# correlation is with ReplyCount.
cat("Correlation between ReplyCount and the sentiment score:", correlation_value, "\n")
## Correlation between ReplyCount and the sentiment score: 0.03431116
rm(correlation_value)
#Mean and Median of the sentiment
# Central tendency of the per-comment sentiment scores.
mean(youtubeData$sentiment, na.rm = TRUE)
## [1] 0.008401548
median(youtubeData$sentiment, na.rm = TRUE)
## [1] 0
##Sentiment Analysis on all the comments
# Re-tokenise the cleaned comments into one row per word.
all_comments <- tibble(line = seq_len(nrow(youtubeData)),
                       text = youtubeData$Comment) %>%
  unnest_tokens(word, text)
#Counting the frequency
all_comments %>% count(word, sort = TRUE)
## # A tibble: 46,931 × 2
## word n
## <chr> <int>
## 1 piers 50099
## 2 tate 34063
## 3 andrew 26253
## 4 like 15979
## 5 just 14200
## 6 man 12470
## 7 interview 11987
## 8 people 11147
## 9 morgan 11019
## 10 can 10373
## # ℹ 46,921 more rows
#Using the nrc library to get sentiment on the comments
# NRC lexicon: maps words to one or more of ten emotion/polarity categories.
sentiments <- get_sentiments("nrc")
sentiments %>% count(sentiment, sort = TRUE)
## # A tibble: 10 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
## 3 fear 1474
## 4 anger 1245
## 5 trust 1230
## 6 sadness 1187
## 7 disgust 1056
## 8 anticipation 837
## 9 joy 687
## 10 surprise 532
#Combining the comments with their respective sentiments
# Join key made explicit; NRC maps one word to several sentiments, so the
# join legitimately fans out to multiple rows per token.
all_comments_sentiment <- all_comments %>% inner_join(sentiments, by = "word")
rm(sentiments)
rm(all_comments)
all_comments_sentiment %>% count(sentiment, sort = TRUE)
## # A tibble: 10 × 2
## sentiment n
## <chr> <int>
## 1 positive 139938
## 2 negative 111924
## 3 trust 91315
## 4 anger 57574
## 5 anticipation 56019
## 6 fear 55769
## 7 sadness 50091
## 8 joy 43772
## 9 disgust 39005
## 10 surprise 23132
# color mapping for each sentiment
# Greens for positive emotions, reds for negative ones.
color_mapping <- c(
  "positive" = "green",
  "trust" = "#2EB62C",
  "anticipation" = "#83D475",
  "joy" = "#ABE098",
  "surprise" = "#C5E8B7",
  "disgust" = "#EA4C46",
  "negative" = "red",
  "fear" = "#F6BDC0",
  "sadness" = "#F1959B",
  "anger" = "#F07470"
)
#Histogram
# Bar chart of emotion frequencies, most common first.
# guide = "none" replaces the deprecated guide = FALSE inside scales.
all_comments_sentiment %>%
  count(sentiment) %>%
  ggplot(aes(reorder(sentiment, -n), n, fill = sentiment)) +
  geom_col(color = "black") +
  scale_fill_manual(values = color_mapping, guide = "none") +
  labs(title = "Distribution of Different Emotions",
       x = "Sentiments",
       y = "Frequency")
rm(color_mapping)
#Word Cloud for negative words
# Wide table: one row per word, one column per sentiment with its count.
all_comments_sentiment_pivot <- all_comments_sentiment %>%
  count(word, sentiment, sort = TRUE) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)
all_comments_sentiment_pivot %>%
  with(wordcloud(word, negative, min.freq = 200, random.order = FALSE,
                 colors = brewer.pal(8, "PuOr")))
#Word Cloud for positive words
all_comments_sentiment_pivot %>%
  with(wordcloud(word, positive, min.freq = 200, random.order = FALSE,
                 colors = brewer.pal(8, "PuOr")))
rm(all_comments_sentiment)
rm(all_comments_sentiment_pivot)
#Network Graph ##Make the Data less voluminous
youtubeData_igraph = youtubeData
# Strip everything up to the last "." so replies' ParentID values can be
# matched against the parent comments' CommentID.
youtubeData_igraph$CommentID <- sub(".*\\.(.*)", "\\1", youtubeData_igraph$CommentID)
# Sort by reply count, most replied-to comments first.
youtubeData_igraph <- youtubeData_igraph[order(-youtubeData_igraph$ReplyCount), ]
# Selecting the top values of the "CommentID" by ReplyCount
top_comments <- head(youtubeData_igraph$CommentID, 14)
# Remove the parent comments that have fewer than 100 replies.
# (The previous comment said 40, but the code filters at 100.)
youtubeData_igraph <- youtubeData_igraph[!(youtubeData_igraph$ReplyCount < 100 & is.na(youtubeData_igraph$ParentID)), ]
#Remove the replies to the comments that are not in the top comments
youtubeData_igraph <- youtubeData_igraph[youtubeData_igraph$ParentID %in% top_comments | is.na(youtubeData_igraph$ParentID), ]
rm(top_comments)
##igraph ###Creating The Nodes / Edges
#Get the type of the comments
# A row with a ParentID is a reply; one without is a top-level (parent)
# comment. A single vectorised ifelse() replaces the original grow-in-a-loop
# append(), and the ParentID column is referenced by name instead of by its
# position (column 7); the resulting vector is identical, in row order.
type_list <- ifelse(is.na(youtubeData_igraph$ParentID), "Parent", "Reply")
###Creating the igraph object
library(igraph)
# Directed graph: each edge points from a reply's author to the parent
# comment's author.
g = graph.empty(directed = T)
#Nodes
# One vertex per row's author.
# NOTE(review): authors appearing on several rows are added as duplicate
# vertices here, and growing the graph one vertex/edge at a time is slow —
# graph_from_data_frame() would build this in one call. Confirm duplicates
# are intended before changing.
for (i in youtubeData_igraph$AuthorDisplayName) {
g = g + vertex(i)
}
# Add edges with weights
# Per the column listing above: column 2 = AuthorDisplayName,
# column 4 = LikeCount, column 7 = ParentID.
num_rows <- nrow(youtubeData_igraph)
for (i in 1:num_rows) {
if (!is.na(youtubeData_igraph[i, 7])) {
# Find the parent comment(s) this reply points to.
matching_rows <- which(youtubeData_igraph$CommentID == youtubeData_igraph[i, 7])
for (j in matching_rows) {
# Edge reply-author -> parent-author, weighted by the reply's LikeCount.
g <- g + edge(youtubeData_igraph[i, 2], youtubeData_igraph[j, 2], weight = youtubeData_igraph[i,4])
}
}
}
rm(i)
rm(j)
rm(matching_rows)
rm(num_rows)
###Plots of igraph object
#Simple Plot
g
## IGRAPH c6c3e92 DNW- 295 280 --
## + attr: name (v/c), weight (e/n)
## + edges from c6c3e92 (vertex names):
## [1] @Baron_von_Fargone->@jamesedginton9822
## [2] @williamking5063 ->@jamesedginton9822
## [3] @sylviabriggs4087 ->@jamesedginton9822
## [4] @happydaysx335 ->@jamesedginton9822
## [5] @oasis808 ->@jamesedginton9822
## [6] @rochanahirun5490 ->@jamesedginton9822
## [7] @microsoftuser5415->@jamesedginton9822
## [8] @_Mango609 ->@jamesedginton9822
## + ... omitted several edges
plot.igraph(g, vertex.label = NA, vertex.size = 6)
# Automatic layout; edges tinted by their weight.
plot(g, layout = layout_nicely(g), vertex.label = NA, vertex.size = 8,
     edge.color = E(g)$weight)
#Plot with circle layout
l <- layout_in_circle(g)
plot(g, layout = l, vertex.label = NA, vertex.size = 5, edge.color = E(g)$weight)
#Plot with sphere layout
l <- layout_on_sphere(g)
plot(g, layout = l, vertex.size = 7, vertex.label = NA)
# Heatmap of the network matrix:
# as_adjacency_matrix() is the current name of the deprecated get.adjacency().
# Label rows/columns with the vertex names — the original used V(g)$media,
# an attribute that does not exist in this graph (so labels were silently
# NULL), and hard-coded 17 columns from the tutorial it was adapted from.
# Reverse all columns instead so the full matrix is shown.
netm <- as_adjacency_matrix(g, attr = "weight", sparse = FALSE)
colnames(netm) <- V(g)$name
rownames(netm) <- V(g)$name
palf <- colorRampPalette(c("gold", "dark orange"))
heatmap(netm[, ncol(netm):1], Rowv = NA, Colv = NA, col = palf(20),
        scale = "none", margins = c(10, 10))
rm(netm)
rm(palf)
#Assigning colors to different types of comments
# Parent comments: red circles. Replies: green squares.
V(g)$color <- ifelse(type_list == "Parent", "red", "green")
V(g)$shape <- ifelse(type_list == "Parent", "circle", "square")
#Plot
g
## IGRAPH c6c3e92 DNW- 295 280 --
## + attr: name (v/c), color (v/c), shape (v/c), weight (e/n)
## + edges from c6c3e92 (vertex names):
## [1] @Baron_von_Fargone->@jamesedginton9822
## [2] @williamking5063 ->@jamesedginton9822
## [3] @sylviabriggs4087 ->@jamesedginton9822
## [4] @happydaysx335 ->@jamesedginton9822
## [5] @oasis808 ->@jamesedginton9822
## [6] @rochanahirun5490 ->@jamesedginton9822
## [7] @microsoftuser5415->@jamesedginton9822
## [8] @_Mango609 ->@jamesedginton9822
## + ... omitted several edges
plot.igraph(g, vertex.label = NA, vertex.size = 6)
# Plot with "layout_nicely"
plot(g, layout = layout_nicely(g), vertex.label = NA, vertex.size = 8,
     edge.color = E(g)$weight)
## Warning in layout_nicely(g): Non-positive edge weight found, ignoring all
## weights during graph layout.
#Plot with circle layout
l <- layout_in_circle(g)
plot(g, layout = l, vertex.label = NA, vertex.size = 5, edge.color = E(g)$weight)
#plot with sphere layout
l <- layout_on_sphere(g)
plot(g, layout = l, vertex.size = 7, vertex.label = NA)
rm(l)
###Attributes of the Graph
#Mean Edge weight
mean(edge_attr(g)$weight)
## [1] 97.63929
#Diameter
# Longest shortest path across the network (treated as undirected).
diameter(g, directed = FALSE)
## [1] 5190
diam <- get_diameter(g, directed = FALSE)
diam
## + 3/295 vertices, named, from c6c3e92:
## [1] @calebgao6931 @waffilushus @dvandeun
rm(diam)
# Total degree (in + out) of every vertex.
deg <- degree(g, mode = "all")
# The ten best-connected authors and their degrees.
top_10_vertices <- head(order(deg, decreasing = TRUE), 10)
result <- data.frame(Vertex = top_10_vertices, Degree = deg[top_10_vertices])
print(result)
## Vertex Degree
## @pushinp2300 3 23
## @jeffcajou76 7 21
## @waffilushus 1 20
## @CharLie_69 2 20
## @Jasejamaica 4 20
## @AlasdairMacgillivray 5 20
## @Aerish369 6 20
## @justforfunxxD 8 20
## @moe17 9 20
## @Blurhornet 10 20
rm(result)
rm(top_10_vertices)
# Mean and median degree across all vertices.
mean_degree <- mean(deg)
median_degree <- median(deg)
print(paste("Mean Degree:", mean_degree))
## [1] "Mean Degree: 1.89830508474576"
print(paste("Median Degree:", median_degree))
## [1] "Median Degree: 1"
rm(mean_degree)
rm(median_degree)
#Degree Distribution
# degree_distribution() returns the RELATIVE frequency of each degree,
# indexed from degree 0 upward. The original code ran table() over those
# probabilities and plotted factor positions, so the x axis was not actually
# the degree. Build the (degree, frequency) pairs directly instead.
degree_dist <- degree_distribution(g)
degree_df <- data.frame(Degree = seq_along(degree_dist) - 1,
                        Freq = degree_dist)
# Plot the degree distribution with ggplot2
ggplot(degree_df, aes(x = Degree, y = Freq)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  labs(title = "Degree Distribution",
       x = "Degree",
       y = "Relative Frequency") +
  theme_minimal()
rm(degree_df)
rm(degree_dist)
rm(deg)
#Average path length, the mean of the shortest distance between each pair of nodes in the network
mean_distance(g, directed = FALSE)
## [1] 315.239
#Some of the edges
head(as_edgelist(g, names = TRUE))
## [,1] [,2]
## [1,] "@Baron_von_Fargone" "@jamesedginton9822"
## [2,] "@williamking5063" "@jamesedginton9822"
## [3,] "@sylviabriggs4087" "@jamesedginton9822"
## [4,] "@happydaysx335" "@jamesedginton9822"
## [5,] "@oasis808" "@jamesedginton9822"
## [6,] "@rochanahirun5490" "@jamesedginton9822"
#Edge density
# Ratio of actual edges to all possible edges.
edge_density(g, loops = FALSE)
## [1] 0.00322841
#Reciprocity
# Share of mutual edges; 0 here means no pair of authors replied to each other.
reciprocity(g)
## [1] 0
###Closeness
head(closeness(g, mode = "all", weights = NA), 10)
## @waffilushus @CharLie_69 @pushinp2300
## 0.050000000 0.052631579 0.006578947
## @Jasejamaica @AlasdairMacgillivray @Aerish369
## 0.055555556 0.014285714 0.100000000
## @jeffcajou76 @justforfunxxD @moe17
## 0.058823529 0.008620690 0.006410256
## @Blurhornet
## 0.050000000
#Make the igraph object to tbl for the next plot
g <- as_tbl_graph(g)
g
## # A tbl_graph: 295 nodes and 280 edges
## #
## # A directed multigraph with 52 components
## #
## # A tibble: 295 × 3
## name color shape
## <chr> <chr> <chr>
## 1 @waffilushus red circle
## 2 @CharLie_69 red circle
## 3 @pushinp2300 red circle
## 4 @Jasejamaica red circle
## 5 @AlasdairMacgillivray red circle
## 6 @Aerish369 red circle
## # ℹ 289 more rows
## #
## # A tibble: 280 × 3
## from to weight
## <int> <int> <dbl>
## 1 16 13 1
## 2 17 13 10
## 3 18 13 80
## # ℹ 277 more rows
#Simple plot to see the structure of the network
g %>%
  activate(nodes) %>%
  ggraph(layout = 'stress') +
  geom_edge_fan(width = .8, color = 'lightblue') +
  coord_fixed() +
  theme_graph()
#Getting the initial Data
# Rebuild the reduced data set for the visNetwork graph, this time keeping
# parents with at least 40 replies and the replies to the 64 most-replied
# comments.
youtubeData_igraph <- youtubeData
youtubeData_igraph$CommentID <- sub(".*\\.(.*)", "\\1", youtubeData_igraph$CommentID)
youtubeData_igraph <- youtubeData_igraph[order(-youtubeData_igraph$ReplyCount), ]
# Selecting the top values of the "CommentID" by ReplyCount
top_comments <- head(youtubeData_igraph$CommentID, 64)
#Remove the parent comments that have less than 40 reply count
youtubeData_igraph <- youtubeData_igraph[!(youtubeData_igraph$ReplyCount < 40 & is.na(youtubeData_igraph$ParentID)), ]
#Remove the replies to the comments that are not in the top comments
youtubeData_igraph <- youtubeData_igraph[youtubeData_igraph$ParentID %in% top_comments | is.na(youtubeData_igraph$ParentID), ]
rm(top_comments)
#Creating the dataset for the VisNetwork graph
# Edge list: a reply links its author to the parent's author (weight = the
# reply's LikeCount); a parent comment gets a self-loop (weight = its
# ReplyCount). Rows are collected into a pre-sized list and bound once,
# replacing the original O(n^2) append() growth; columns are referenced by
# name instead of position. Row order matches the original loop exactly.
edge_rows <- vector("list", nrow(youtubeData_igraph))
for (i in seq_len(nrow(youtubeData_igraph))) {
  parent_id <- youtubeData_igraph$ParentID[i]
  if (!is.na(parent_id)) {
    matching_rows <- which(youtubeData_igraph$CommentID == parent_id)
    if (length(matching_rows) > 0) {
      edge_rows[[i]] <- data.frame(
        from = youtubeData_igraph$AuthorDisplayName[i],
        to = youtubeData_igraph$AuthorDisplayName[matching_rows],
        weight = youtubeData_igraph$LikeCount[i])
    }
  } else {
    edge_rows[[i]] <- data.frame(
      from = youtubeData_igraph$AuthorDisplayName[i],
      to = youtubeData_igraph$AuthorDisplayName[i],
      weight = youtubeData_igraph$ReplyCount[i])
  }
}
edges <- do.call(rbind, edge_rows)
from_list <- edges$from
to_list <- edges$to
weight_list <- edges$weight
rm(edge_rows, edges, parent_id, i)